# -*- coding: utf-8 -*-
import datetime
import math
import scrapy
import re
from apps.tax_policy.tax_policy.items import NetTaxPolicyItem as Item, urllib, time, urljoin, json, urlencode
from apps.tax_policy.tax_policy.spiders.base_spider.base_tax_policy_spider import BaseTaxPolicySpider


class FujianZfxxgkPolicy(BaseTaxPolicySpider):
    """Policy spider for the Xiamen municipal government site.

    Starting from the list page(s) configured in ``start_requests``, the
    spider follows every entry link to its detail page and yields one item
    per detail page, then walks the remaining list pages advertised by the
    ``pageCount`` value embedded in the first list page.
    """

    name = 'fujian_xiamen_zfxxgk_policy'

    province = '福建省'
    city = '厦门市'
    # county is supplied per start URL in start_requests, not as a class attribute.
    park = ''
    page_size = int(26 * 1.5)

    def __init__(self, get_next=False, **kwargs):
        """Store the ``get_next`` spider argument.

        Args:
            get_next: Stored unchanged on the instance. When supplied through
                ``scrapy crawl -a get_next=...`` it arrives as a string.
            **kwargs: Forwarded to the base spider constructor.
        """
        super().__init__(**kwargs)
        self.logger.debug("get_next=%r", get_next)
        self.get_next = get_next

    @classmethod
    def update_settings(cls, settings) -> None:
        """Apply this spider's setting overrides at ``spider`` priority.

        ``custom_settings`` (if any) is applied first, then the overrides
        below take precedence over it.
        """
        # COOKIES_ENABLED, CONCURRENT_REQUESTS, DOWNLOAD_DELAY and
        # DOWNLOADER_MIDDLEWARES are not overridden here.
        new_settings = {
            **(cls.custom_settings or {}),
            "DOWNLOAD_TIMEOUT": 200,
            # Let responses with these statuses reach the callbacks instead
            # of being dropped by the HTTP error middleware.
            "HTTPERROR_ALLOWED_CODES": [302, 400, 404, 407, 500, 200, 202, 502, 429, 521],
        }
        settings.setdict(new_settings, priority="spider")

    def start_requests(self):
        """Yield the initial list-page request for each configured source."""
        for source, county, url in [
            ["厦门市人民政府", "", "https://www.xm.gov.cn/zwgk/flfg/bmwj/index.htm"],
        ]:
            item = {'source': source, 'county': county}
            yield scrapy.Request(url, callback=self.parse_list, meta={"item": item})

    def parse_list(self, response, **kwargs):
        """Parse one list page.

        Yields a detail-page request for every list entry that has a usable
        link. Unless the request was marked ``is_next=False`` in its meta,
        also yields requests for the remaining list pages.
        """
        prev_item = response.meta.get('item')
        for elem in response.xpath(
            "//div[@class='gl_list1']//li/a/ancestor::li[string-length(string(.))>=10]"
        ):
            href = elem.xpath(""".//a/@href""").re_first(r"""(.*/.*)""")
            # Check the raw href *before* joining: urljoin() of a missing href
            # returns the list page URL itself, which would never be falsy and
            # would be followed as if it were a detail page.
            if not href:
                continue
            item = Item()
            item['source_url'] = response.urljoin(href)
            # Provisional value (full text of the entry); parse_detail
            # overwrites it with the date found on the detail page.
            item['publish_date'] = elem.xpath("""string(.)""").get()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            self.logger.debug("%s -> %s", response.url, item['source_url'])
            yield response.follow(item['source_url'], callback=self.parse_detail, meta={'item': item})

        # Paginated requests carry is_next=False so that only the first list
        # page fans out to the others.
        if response.meta.get("is_next") is False:
            return

        total_page = response.xpath(""".""").re_first(r"""pageCount\s*:\s*["\']?(\d+)\s*""")
        self.logger.debug("%s page: %s", response.url, total_page)
        if total_page is None:
            # No pageCount marker in the page: nothing to paginate over.
            self.logger.warning("pageCount not found on %s; skipping pagination", response.url)
            return
        # Follow-up pages are requested as index_1.htm .. index_{pageCount-1}.htm.
        for page_num in range(1, int(total_page)):
            yield response.follow(response.url.replace("/index.htm", f"/index_{page_num}.htm"),
                                  callback=self.parse_list, meta={'item': prev_item, 'is_next': False})

    def parse_detail(self, response, **kwargs):
        """Fill in the item from a detail page and yield it.

        Title and publish date are read from the page's ``ArticleTitle`` and
        ``PubDate`` meta tags; the publish date falls back to the text of the
        element with class ``fbrq``. ``content`` is the serialized page.
        """
        item = response.meta.get('item')
        if item is None:
            item = Item()
        item['title'] = response.xpath("""string(//meta[@name='ArticleTitle']/@content)""").get()
        item['publish_date'] = (
            response.xpath("""string(//meta[@name='PubDate']/@content)""").get()
            or response.xpath("//*[@class='fbrq']/text()").get()
        )
        item['content'] = response.xpath(""".""").get()
        # 'source' and 'county' were already copied onto the item in parse_list.
        item['province'] = self.province
        item['city'] = self.city
        item['park'] = self.park
        yield item


if __name__ == "__main__":
    # Allow running this module directly to launch the spider, equivalent to
    # invoking the `scrapy crawl` command line with the same arguments.
    from scrapy import cmdline

    crawl_argv = ["scrapy", "crawl", "fujian_xiamen_zfxxgk_policy", "-a", "get_next=0"]
    cmdline.execute(crawl_argv)
